import numpy as np

from zkl_llmpt_iterator.tokenizer.tokenizer import TextTokenizer


class Utf8Tokenizer(TextTokenizer):
    """Byte-level tokenizer: every UTF-8 byte of the text is one token id."""

    @property
    def vocab_tokens_n(self) -> int:
        """Vocabulary size: one token id per possible byte value."""
        return 256

    def encode(self, text: str) -> np.ndarray:
        """Return the UTF-8 bytes of *text* as a new int64 array."""
        utf8_bytes = bytes(text, "utf8")
        # frombuffer yields a uint8 view over the bytes object; astype
        # copies the values into a fresh int64 array.
        byte_values = np.frombuffer(utf8_bytes, dtype=np.uint8)
        return byte_values.astype(np.int64)

    def decode(self, tokens: np.ndarray) -> str:
        """Return the text obtained by decoding *tokens* as UTF-8 bytes.

        The tokens are converted to uint8 without range validation, and
        byte sequences that are not valid UTF-8 are dropped
        (``errors="ignore"``) instead of raising.
        """
        raw = np.asarray(tokens, dtype=np.uint8).tobytes()
        return raw.decode("utf8", errors="ignore")
